Load R packages and define colour functions

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace) ; library(corrplot)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(foreach) ; library(doParallel)
library(knitr) ; library(kableExtra) ; library(xtable)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

# Get colors from the ggplot palette
gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n+1)
  pal = hcl(h = hues, l = 65, c = 100)[1:n]
}

# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
  
  n = length(unique(mods))-1
  set.seed(123) ; rand_order = sample(1:n)
  mod_colors = c('white', gg_colour_hue(n)[rand_order])
  names(mod_colors) = mods %>% table %>% names
  
  return(mod_colors)
}

# Compare results from GSEA and ORA
compare_methods = function(GSEA_list, ORA_list, top_modules_enrichment, top_modules, database){

  for(module in top_modules){
    cat(paste0('  \n  \n Enrichment results for cluster ', 
                 genes_info$module_number[genes_info$Module==module][1], ':  \n'))
    
    cat(paste0('- GSEA has ', nrow(GSEA_list[[module]][[database]]@result), ' enriched term(s)  \n'))
    cat(paste0('- ORA has  ', nrow(ORA_list[[module]][[database]]@result), ' enriched term(s)  \n'))
    cat(paste0('- ', nrow(top_modules_enrichment[[module]][[database]]), 
               ' terms are enriched in both methods  \n  \n'))

    enriched_terms = top_modules_enrichment[[module]][[database]] %>%
                     dplyr::select(ID, Description.x, p.adjust_ORA, p.adjust_GSEA, qvalue_ORA, GeneRatio) %>%
                     dplyr::rename('Description' = Description.x)
    
    if(nrow(enriched_terms)>0){
      print(enriched_terms %>% mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>% 
                          arrange(pval_mean) %>% dplyr::select(-pval_mean) %>% 
            kable %>% kable_styling(full_width = F))
      
      ##########################################################################################################
      # Get genes involved
      genes = c()
      i=1
      for(row_genes in top_modules_enrichment[[module]][[database]] %>% pull(geneID)){
        genes = c(genes, strsplit(row_genes,'/') %>% unlist) %>% unique
        if(i==5){
          cat(paste0('Genes involved in top 5 enriched terms: ',
                     paste(gene_names %>% filter(entrezgene %in% genes) %>% pull(hgnc_symbol) %>% unique %>% 
                           sort, collapse = ', '),'\n'))
        }
        i = i+1
      }
      
      if(i != 5){
        genes = gene_names %>% filter(entrezgene %in% genes) %>% pull(hgnc_symbol) %>% unique %>% sort
        cat(paste0('\nGenes involved in all enriched terms: ', paste(genes, collapse = ', ')))  
      }
      ##########################################################################################################
    }
    
  }
}

plot_results = function(top_modules_enrichment, top_modules, database){
  
  l = htmltools::tagList()

  for(i in 1:length(top_modules)){
    
    plot_data = top_modules_enrichment[[top_modules[i]]][[database]] %>%
                dplyr::rename('Description' = Description.x)
    
    if(nrow(plot_data)>5){
      min_val = min(min(plot_data$p.adjust_GSEA), min(plot_data$p.adjust_ORA))
      max_val = max(max(max(plot_data$p.adjust_GSEA), max(plot_data$p.adjust_ORA)), 0.05)
      ggp = ggplotly(plot_data %>% ggplot(aes(p.adjust_GSEA, p.adjust_ORA, color = NES)) + 
                     geom_point(aes(id = Description)) + 
                     geom_vline(xintercept = 0.05, color = 'gray', linetype = 'dotted') + 
                     geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') + 
                     ggtitle(paste0('Enriched terms in common for cluster ', 
                                    genes_info$module_number[genes_info$Module==top_modules[i]][1])) +
                     scale_x_continuous(limits = c(min_val, max_val)) + 
                     scale_y_continuous(limits = c(min_val, max_val)) + 
                     xlab('Corrected p-value for GSEA') + ylab('Corrected p-value for ORA') +
                     scale_colour_viridis(direction = -1) + theme_minimal() + coord_fixed())
      l[[i]] = ggp
    }
  }
  
  return(l)
}


# plot_shared_genes(top_modules_enrichment, top_modules, 'GO')
plot_shared_genes = function(top_modules_enrichment, top_modules, database){

  for(tm in 1:length(top_modules)){
    
    plot_data = top_modules_enrichment[[top_modules[tm]]][[database]] %>% 
                mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>% arrange(pval_mean) %>% 
                dplyr::select(ID, geneID)
    
    if(nrow(plot_data)>=2){

      plot_data = plot_data %>% slice_head(n=5)
    
      shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
      for(i in 1:(nrow(plot_data)-1)){
        for(j in (i+1):nrow(plot_data)){
          gene_set_1 = strsplit(plot_data$geneID[i], '/') %>% unlist
          gene_set_2 = strsplit(plot_data$geneID[j], '/') %>% unlist
          shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
          shared_genes[j,i] = shared_genes[i,j]
        }
      }
      rownames(shared_genes) = plot_data$ID
      colnames(shared_genes) = plot_data$ID
  
      corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n', 
               tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
               mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
               title = paste0('Genes in common for top terms in cluster ',
                              genes_info$module_number[genes_info$Module==top_modules[tm]][1]))
    }
  }
}

# Print table with top results (for annex in thesis)
print_table_w_top_results = function(top_modules_enrichment, module, database, n){
  
  enriched_terms = top_modules_enrichment[[module]][[database]] %>%
                   mutate(pval_mean = p.adjust_ORA + p.adjust_GSEA) %>% arrange(pval_mean) %>%
                   top_n(-n, wt=pval_mean) %>% dplyr::rename('Description' = Description.x) %>%
                   dplyr::select(ID, Description, p.adjust_GSEA, p.adjust_ORA, NES, GeneRatio) %>%
                   xtable(display =c('s','s','s','e','e','f','s'))

  return(print(enriched_terms, include.rownames=FALSE))
}

#print_table_w_top_results(selected_modules_enrichment, names(selected_modules_enrichment)[2], 'DN', 5)

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame

# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')

# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>% 
             left_join(datGenes %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
             dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
             left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))


################################################################################################################
# Get entrezene ID of genes
gene_names = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
  
# ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                       values=gene_names$ensembl_gene_id, mart=mart)

gene_names = biomart_output %>% left_join(gene_names %>% dplyr::select(ensembl_gene_id, hgnc_symbol), 
                                          by='ensembl_gene_id') %>% dplyr::rename('ID'=ensembl_gene_id)

rm(getinfo, mart, biomart_output)



rm(dds, WGCNA_metrics)




Methodology

Both GSEA and ORA are commonly used to study enrichment in sets of genes, but when using them for studying our modules both have shortcomings:

module = genes_info %>% filter(abs(MTcor) > 0.7) %>% slice_head(n=1) %>% pull(Module) %>% as.character

plot_data = genes_info %>% dplyr::select(Module, paste0('MM.',gsub('#','',module))) %>% 
            mutate(in_module = substring(Module,2) == gsub('#','',module), 
                   selected_module = paste('Cluster', genes_info$module_number[genes_info$Module==module][1] %>% 
                                             as.character)) %>%
            mutate(alpha = ifelse(in_module, 0.8, 0.1))
colnames(plot_data)[2] = 'MM'

p = plot_data %>% ggplot(aes(selected_module, MM, color = in_module)) + geom_jitter(alpha = plot_data$alpha) + 
    xlab('') + ylab(paste('Cluster membership to cluster', 
                          genes_info$module_number[genes_info$Module==module][1])) + coord_flip() + 
    theme_minimal() + theme(legend.position = 'bottom', axis.text.y = element_blank(),
                            axis.ticks.y = element_blank()) + 
    labs(color = paste('Gene belongs to cluster', genes_info$module_number[genes_info$Module==module][1]))

ggExtra::ggMarginal(p, type = 'density', groupColour = TRUE, groupFill = TRUE, margins = 'x', size=1)

rm(modules, module, p, plot_data)

So perhaps it could be useful to use both methods together, since they seem to complement each other’s shortcomings very well, performing the enrichment using both methods and identifying the terms that are found to be enriched by both

Note: Since the enrichment in both methods is quite a stric restriction, we decide to relax the corrected p-value threshold (using Bonferroni correction) to 0.1.


Perform Enrichment Analysis

Note: This script may take a bit to run (~30 mins with an 8 core Intel(R) Core(TM) i5-8400H CPU @ 2.50GHz laptop) and sometimes there are problems with the API and it will freeze or kill the process printing ‘error writing to connection’, but this when this has happened, it has been fixed in less than a day (except once that took 4 days…).

#top_modules = c('#44A0FF', '#D177FF', '#F47B5B', '#00BADE', '#64B200', '#DD71FA')
# Modules: 6,38,63,22,34,46
top_modules = c('#00B2F3','#529EFF','#CA9700','#00BF75','#1BB700','#8893FF')

if(file.exists('./../Data/preprocessedData/top_modules_enrichment.RData')){
  load('./../Data/preprocessedData/top_modules_enrichment.RData')
  load('./../Data/preprocessedData/GSEA_results.RData')
  load('./../Data/preprocessedData/ORA_results.RData')
} else{
    
  ################################################################################################################
  # Prepare dataset for Enrichment Analysis
  
  EA_dataset = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
  
  # ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
  getinfo = c('ensembl_gene_id','entrezgene')
  mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
  biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                         values=EA_dataset$ensembl_gene_id, mart=mart)
  
  EA_dataset = biomart_output %>% left_join(EA_dataset, by='ensembl_gene_id') %>% dplyr::rename('ID'=ensembl_gene_id)
  
  rm(getinfo, mart, biomart_output)
  
  ################################################################################################################
  # GSEA enrichment
  
  file_name = './../Data/preprocessedData/GSEA_results.RData'
  if(file.exists(file_name)){
    load(file_name)
  } else {
    nPerm = 1e5
    GSEA_dataset = EA_dataset %>% dplyr::select(ID, entrezgene, contains('MM.'))
    GSEA_enrichment = list()
    
    for(module in top_modules){
      
      cat(paste0('\nModule: ', which(top_modules == module), '/', length(top_modules)))
      
      geneList = GSEA_dataset %>% pull(paste0('MM.',substring(module,2)))
      names(geneList) = GSEA_dataset %>% pull(entrezgene) %>% as.character
      geneList = sort(geneList, decreasing = TRUE)
      
      GSEA_GO = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, 
                      nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_DO = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, 
                      nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_DGN = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, 
                        nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_KEGG = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, 
                          nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_Reactome = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, 
                                 nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_enrichment[[module]] = list('GO' = GSEA_GO, 'DO' = GSEA_DO, 'DGN' = GSEA_DGN, 'KEGG' = GSEA_KEGG, 
                                       'Reactome' = GSEA_Reactome)
      
      # Save after each iteration (in case it breaks)
      save(GSEA_enrichment, file = file_name)
    }
    
    rm(GSEA_dataset, nPerm, geneList, GSEA_GO, GSEA_DO, GSEA_DGN, GSEA_KEGG, GSEA_Reactome)
    
  }
  
  ################################################################################################################
  # ORA enrichment
  
  file_name = './../Data/preprocessedData/ORA_results.RData'
  if(file.exists(file_name)){
    load(file_name)
  } else {
    # Prepare input
    universe = EA_dataset$entrezgene %>% as.character
    
    # Perform Enrichment
    ORA_enrichment = list()
    
    for(module in top_modules){
      
      genes_in_module = EA_dataset %>% filter(Module == module) %>% pull(entrezgene)
      
      ORA_GO = enrichGO(gene = genes_in_module, universe = universe, OrgDb = org.Hs.eg.db, ont = 'All', 
                        pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1, qvalueCutoff = 1)
      
      ORA_DO = enrichDO(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                        pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
      
      ORA_DGN = enrichDGN(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                          pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
      
      ORA_KEGG = enrichKEGG(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                            pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1) 
      
      ORA_Reactome = enrichPathway(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                                   pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1)
      
      ORA_enrichment[[module]] = list('GO' = ORA_GO, 'DO' = ORA_DO, 'DGN' = ORA_DGN, 'KEGG' = ORA_KEGG, 
                                      'Reactome' = ORA_Reactome)
      
      # Save after each iteration
      save(ORA_enrichment, file = file_name)
    }
    
    rm(universe, genes_in_module, module, ORA_GO, ORA_DGN, ORA_DO, ORA_KEGG, ORA_Reactome)
  
  }
  
  ################################################################################################################
  # Get shared enrichment for each module
  
  top_modules_enrichment = list()
  
  for(module in top_modules){
    
    module_enrichment = list()
    GSEA_enrichment_for_module = GSEA_enrichment[[module]]
    ORA_enrichment_for_module = ORA_enrichment[[module]]
    
    for(dataset in c('KEGG', 'Reactome', 'GO', 'DO', 'DGN')){
      
      GSEA_enrichment_dataset = GSEA_enrichment_for_module[[dataset]] %>% data.frame %>%
        dplyr::rename('pvalue_GSEA' = pvalue, 'p.adjust_GSEA' = p.adjust, 'qvalues_GSEA' = qvalues)
      
      ORA_enrichment_dataset = ORA_enrichment_for_module[[dataset]] %>% data.frame %>%
        dplyr::rename('pvalue_ORA' = pvalue, 'p.adjust_ORA' = p.adjust, 'qvalue_ORA' = qvalue)
      
      # Get shared enrichments (if any)
      shared_enrichment_dataset = GSEA_enrichment_dataset %>% inner_join(ORA_enrichment_dataset, by = 'ID')
      
      module_enrichment[[dataset]] = shared_enrichment_dataset
    }
    
    top_modules_enrichment[[module]] = module_enrichment  
  }
  
  save(top_modules_enrichment, file = './../Data/preprocessedData/top_modules_enrichment.RData')
  
  rm(module, module_enrichment, GSEA_enrichment_for_module, ORA_enrichment_for_module, dataset, 
     GSEA_enrichment_dataset, ORA_enrichment_dataset, shared_enrichment_dataset)
}


3.4.1 Top clusters by cluster-diagnosis correlation


top_modules_mtcor = top_modules[1:3]


Gene Ontology

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'GO')

Enrichment results for cluster 6:
- GSEA has 245 enriched term(s)
- ORA has 20 enriched term(s)
- 14 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
GO:0060333 interferon-gamma-mediated signaling pathway 0.0000061 0.0886604 0.0000024 9/72
GO:0034341 response to interferon-gamma 0.0001111 0.0888568 0.0000171 10/72
GO:0071346 cellular response to interferon-gamma 0.0004974 0.0889945 0.0000658 9/72
GO:0019221 cytokine-mediated signaling pathway 0.0000079 0.0899264 0.0000024 18/72
GO:0045087 innate immune response 0.0000079 0.0899416 0.0000024 19/72
GO:0034097 response to cytokine 0.0000490 0.0903737 0.0000113 21/72
GO:0071345 cellular response to cytokine stimulus 0.0000882 0.0903584 0.0000163 20/72
GO:0002252 immune effector process 0.0017538 0.0907519 0.0002030 19/72
GO:0009607 response to biotic stimulus 0.0142368 0.0899074 0.0014646 14/72
GO:0043207 response to external biotic stimulus 0.0413151 0.0900083 0.0034776 13/72
GO:0051707 response to other organism 0.0413151 0.0900083 0.0034776 13/72
GO:0098542 defense response to other organism 0.0557650 0.0892505 0.0043027 9/72
GO:0050778 positive regulation of immune response 0.0624235 0.0899530 0.0044459 13/72
GO:0002460 adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains 0.0771116 0.0890169 0.0050998 7/72

Genes involved in top 5 enriched terms: ANXA1, BST2, C1R, C1S, CD44, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, MR1, NLRC5, PARP9, RAB27A, TNFRSF1A

Genes involved in all enriched terms: ADAM9, ALPK1, ANXA1, ANXA2, APOL1, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, DDIT3, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, MAOB, MR1, NEXN, NFIL3, NLRC5, PAK2, PARP9, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A

Enrichment results for cluster 38:
- GSEA has 37 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 63:
- GSEA has 59 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_mtcor, 'GO')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'GO')


Disease Ontology

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'DO')

Enrichment results for cluster 6:
- GSEA has 194 enriched term(s)
- ORA has 359 enriched term(s)
- 8 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
DOID:77 gastrointestinal system disease 0.0100986 0.0117697 0.0047845 15/44
DOID:1909 melanoma 0.0164602 0.0117487 0.0047845 14/44
DOID:65 connective tissue disease 0.0205683 0.0117780 0.0047845 16/44
DOID:409 liver disease 0.0308422 0.0116701 0.0052775 12/44
DOID:3118 hepatobiliary disease 0.0378128 0.0116659 0.0052775 12/44
DOID:0080001 bone disease 0.0557948 0.0117725 0.0055805 14/44
DOID:557 kidney disease 0.0559777 0.0116492 0.0055805 10/44
DOID:18 urinary system disease 0.0673986 0.0116475 0.0058792 10/44

Genes involved in top 5 enriched terms: ADAM9, ANXA2, APOL1, CD44, CHI3L1, COL4A1, CTNND1, DDIT3, FLNA, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, NAMPT, PAK2, RAB27A, S100A10, SDC1, SERPINA3, TNFRSF1A

Genes involved in all enriched terms: ADAM9, ANXA1, ANXA2, APOL1, CD44, CHI3L1, COL4A1, CTNND1, DDIT3, FLNA, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, NAMPT, PAK2, PDLIM4, POR, RAB27A, S100A10, SDC1, SERPINA3, SLC26A2, TNFRSF1A

Enrichment results for cluster 38:
- GSEA has 33 enriched term(s)
- ORA has 352 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 63:
- GSEA has 93 enriched term(s)
- ORA has 436 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_mtcor, 'DO')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'DO')


Disease Gene Network

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'DGN')

Enrichment results for cluster 6:
- GSEA has 326 enriched term(s)
- ORA has 1260 enriched term(s)
- 19 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
umls:C0036202 Sarcoidosis 0.0006259 0.0453306 0.0002160 9/67
umls:C0023895 Liver diseases 0.0002951 0.0457957 0.0002036 13/67
umls:C0024115 Lung diseases 0.0033629 0.0454935 0.0007735 10/67
umls:C0677607 Hashimoto Disease 0.0064239 0.0451914 0.0008705 6/67
umls:C0021368 Inflammation 0.0075685 0.0458942 0.0008705 12/67
umls:C0021390 Inflammatory Bowel Diseases 0.0106762 0.0459959 0.0010090 14/67
umls:C0409651 Seropositive rheumatoid arthritis 0.0131302 0.0451427 0.0010090 4/67
umls:C1800706 Idiopathic Pulmonary Fibrosis 0.0141286 0.0454432 0.0010090 9/67
umls:C0162871 Aortic Aneurysm, Abdominal 0.0146213 0.0452816 0.0010090 7/67
umls:C0042373 Vascular Diseases 0.0170781 0.0455706 0.0010182 10/67
umls:C0009324 Ulcerative Colitis 0.0177055 0.0459426 0.0010182 13/67
umls:C0042164 Uveitis 0.0215871 0.0453599 0.0011459 6/67
umls:C0004153 Atherosclerosis 0.0240654 0.0463680 0.0011862 17/67
umls:C0019196 Hepatitis C 0.0350402 0.0458690 0.0015992 12/67
umls:C0034069 Pulmonary Fibrosis 0.0370794 0.0455296 0.0015992 9/67
umls:C0242583 Bare Lymphocyte Syndrome 0.0422210 0.0451923 0.0016186 4/67
umls:C2931418 Bare lymphocyte syndrome 2 0.0422210 0.0451923 0.0016186 4/67
umls:C0025007 Measles 0.0521393 0.0452571 0.0018569 6/67
umls:C0031099 Periodontitis 0.0837657 0.0453618 0.0026274 7/67

Genes involved in top 5 enriched terms: ANXA2, BAG3, C1S, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R, S100A10, SDC1, SLC3A2, TNFRSF1A

Genes involved in all enriched terms: ADAM9, ALPK1, ANGPTL4, ANXA1, ANXA2, APOL1, BAG3, C1S, CD44, CHI3L1, COL4A1, CRISPLD2, DDIT3, FLNA, GBP1, HAP1, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL18BP, IL6R, IRF7, MAOB, MR1, NAMPT, NEXN, NFIL3, PAK2, PARP9, S100A10, SDC1, SERPINA3, SLC3A2, SP110, TNFRSF1A, TRIM69

Enrichment results for cluster 38:
- GSEA has 28 enriched term(s)
- ORA has 1299 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 63:
- GSEA has 69 enriched term(s)
- ORA has 1610 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_mtcor, 'DGN')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'DGN')


KEGG

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'KEGG')

Enrichment results for cluster 6:
- GSEA has 42 enriched term(s)
- ORA has 175 enriched term(s)
- 8 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
hsa04640 Hematopoietic cell lineage 0.0018028 0.0062067 0.0008968 5/36
hsa05150 Staphylococcus aureus infection 0.0021765 0.0061988 0.0008968 5/36
hsa05164 Influenza A 0.0205482 0.0062136 0.0056443 6/36
hsa05322 Systemic lupus erythematosus 0.0289963 0.0062168 0.0059737 5/36
hsa05320 Autoimmune thyroid disease 0.0823203 0.0062032 0.0085058 3/36
hsa05330 Allograft rejection 0.0823203 0.0062032 0.0085058 3/36
hsa05332 Graft-versus-host disease 0.0823203 0.0062032 0.0085058 3/36
hsa05169 Epstein-Barr virus infection 0.0825746 0.0062284 0.0085058 6/36

Genes involved in top 5 enriched terms: C1R, C1S, CD44, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R

Genes involved in all enriched terms: C1R, C1S, CD44, HLA-DQB1, HLA-DRB1, HLA-DRB5, IL6R, IRF7, PIK3R2, TNFRSF1A

Enrichment results for cluster 38:
- GSEA has 3 enriched term(s)
- ORA has 155 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 63:
- GSEA has 6 enriched term(s)
- ORA has 209 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_mtcor, 'KEGG')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'KEGG')


Reactome

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_mtcor, 'Reactome')

Enrichment results for cluster 6:
- GSEA has 20 enriched term(s)
- ORA has 232 enriched term(s)
- 7 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
R-HSA-913531 Interferon Signaling 0.0000928 0.0201370 0.0000402 10/58
R-HSA-1280215 Cytokine Signaling in Immune system 0.0000064 0.0205247 0.0000056 18/58
R-HSA-877300 Interferon gamma signaling 0.0011405 0.0200244 0.0003294 7/58
R-HSA-202433 Generation of second messenger molecules 0.0114849 0.0199493 0.0024882 4/58
R-HSA-168249 Innate Immune System 0.0627832 0.0206425 0.0068010 16/58
R-HSA-202427 Phosphorylation of CD3 and TCR zeta chains 0.0619852 0.0597430 0.0068010 3/58
R-HSA-389948 PD-1 signaling 0.0619852 0.0995716 0.0068010 3/58

Genes involved in top 5 enriched terms: ANXA1, ANXA2, ATP6V1C2, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, NLRC5, PAK2, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A

Genes involved in all enriched terms: ANXA1, ANXA2, ATP6V1C2, BST2, C1R, C1S, CD44, CHI3L1, CRISPLD2, FLNA, GBP1, GBP2, HLA-DQB1, HLA-DRB1, HLA-DRB5, IFI35, IL18BP, IL6R, IRF7, NLRC5, PAK2, PIK3R2, PYGL, RAB27A, SDC1, SERPINA3, SURF4, TNFRSF1A

Enrichment results for cluster 38:
- GSEA has 15 enriched term(s)
- ORA has 405 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 63:
- GSEA has 22 enriched term(s)
- ORA has 591 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_mtcor, 'Reactome')
plot_shared_genes(top_modules_enrichment, top_modules_mtcor, 'Reactome')





3.4.2 Top clusters by enrichment in SFARI Genes


top_modules_SFARI = top_modules[4:6]


Gene Ontology

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'GO')

Enrichment results for cluster 22:
- GSEA has 53 enriched term(s)
- ORA has 12 enriched term(s)
- 2 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
GO:0045944 positive regulation of transcription by RNA polymerase II 0.0002441 0.0484942 0.0002145 12/30
GO:0045444 fat cell differentiation 0.0444174 0.0607138 0.0195132 5/30

Genes involved in all enriched terms: CSRNP1, EGR1, EGR2, EGR4, FOSL2, HES1, IER2, KLF10, NPAS4, NR4A1, NR4A3, PER2, RPS6KA3

Enrichment results for cluster 34:
- GSEA has 85 enriched term(s)
- ORA has 2 enriched term(s)
- 1 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
GO:0007215 glutamate receptor signaling pathway 0.0033498 0.0645637 0.0032175 7/75

Genes involved in all enriched terms: CX3CL1, GRIA1, GRIK3, GRM2, MINK1, NLGN3, SSTR1

Enrichment results for cluster 46:
- GSEA has 13 enriched term(s)
- ORA has 0 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_SFARI, 'GO')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'GO')


Disease Ontology

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'DO')

Enrichment results for cluster 22:
- GSEA has 59 enriched term(s)
- ORA has 190 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 34:
- GSEA has 2 enriched term(s)
- ORA has 276 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 46:
- GSEA has 6 enriched term(s)
- ORA has 112 enriched term(s)
- 0 terms are enriched in both methods


Disease Gene Network

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'DGN')

Enrichment results for cluster 22:
- GSEA has 67 enriched term(s)
- ORA has 704 enriched term(s)
- 3 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
umls:C0476089 Endometrial Carcinoma 0.0351175 0.0264086 0.0163471 8/30
umls:C0740392 Infarction, Middle Cerebral Artery 0.0387641 0.0336964 0.0163471 4/30
umls:C0151744 Myocardial Ischemia 0.0722839 0.0272557 0.0163471 7/30

Genes involved in all enriched terms: ARC, BRAF, CTSB, DUSP6, EGR1, EGR2, EGR4, FOSL2, HES1, NR4A1, PER2, SPRY2, TRIB1

Enrichment results for cluster 34:
- GSEA has 0 enriched term(s)
- ORA has 779 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 46:
- GSEA has 9 enriched term(s)
- ORA has 424 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_SFARI, 'DGN')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'DGN')


KEGG

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'KEGG')

Enrichment results for cluster 22:
- GSEA has 7 enriched term(s)
- ORA has 77 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 34:
- GSEA has 19 enriched term(s)
- ORA has 178 enriched term(s)
- 1 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
hsa04080 Neuroactive ligand-receptor interaction 0.0009935 0.0041954 0.0009342 8/35

Genes involved in all enriched terms: CNR1, GABRA5, GABRB3, GRIA1, GRIK3, GRM2, LYPD6B, SSTR1

Enrichment results for cluster 46:
- GSEA has 24 enriched term(s)
- ORA has 94 enriched term(s)
- 0 terms are enriched in both methods


Plots of the results when there are more than 5 terms in common between methods:

plot_results(top_modules_enrichment, top_modules_SFARI, 'KEGG')
plot_shared_genes(top_modules_enrichment, top_modules_SFARI, 'KEGG')


Reactome

compare_methods(GSEA_enrichment, ORA_enrichment, top_modules_enrichment, top_modules_SFARI, 'Reactome')

Enrichment results for cluster 22:
- GSEA has 2 enriched term(s)
- ORA has 126 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 34:
- GSEA has 49 enriched term(s)
- ORA has 287 enriched term(s)
- 0 terms are enriched in both methods

Enrichment results for cluster 46:
- GSEA has 30 enriched term(s)
- ORA has 131 enriched term(s)
- 0 terms are enriched in both methods




——


Relaxing restrictions for clusters that weren’t enriched in any term

# Get cluster name for clusters with numbers 38, 63, and 46
selected_modules = c(genes_info %>% filter(module_number==38) %>% slice_head(1) %>% pull(Module) %>% as.character,
                     genes_info %>% filter(module_number==63) %>% slice_head(1) %>% pull(Module) %>% as.character,
                     genes_info %>% filter(module_number==46) %>% slice_head(1) %>% pull(Module) %>% as.character)

if(file.exists('./../Data/preprocessedData/top_modules_enrichment_relaxed.RData')){
  load('./../Data/preprocessedData/top_modules_enrichment_relaxed.RData')
  load('./../Data/preprocessedData/GSEA_results_relaxed.RData')
  load('./../Data/preprocessedData/ORA_results_relaxed.RData')
} else{

  pvalueCutoff = 0.5
  ################################################################################################################
  # Prepare dataset for Enrichment Analysis
  
  EA_dataset = genes_info %>% dplyr::rename('ensembl_gene_id' = ID) %>% filter(Module!='gray')
  
  # ClusterProfile works with Entrez Gene Ids, o we have to assign one to each gene
  getinfo = c('ensembl_gene_id','entrezgene')
  mart=useMart(biomart='ENSEMBL_MART_ENSEMBL',dataset='hsapiens_gene_ensembl',host='feb2014.archive.ensembl.org')
  biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                         values=EA_dataset$ensembl_gene_id, mart=mart)
  
  EA_dataset = biomart_output %>% left_join(EA_dataset, by='ensembl_gene_id') %>% 
               dplyr::rename('ID'=ensembl_gene_id) %>% distinct(entrezgene, .keep_all = TRUE)
  
  rm(getinfo, mart, biomart_output)
  
  ################################################################################################################
  # GSEA enrichment
  
  file_name = './../Data/preprocessedData/GSEA_results_relaxed.RData'
  if(file.exists(file_name)){
    load(file_name)
  } else {
    cat('\n\nPerforming GSEA\n')
    
    nPerm = 1e5
    GSEA_dataset = EA_dataset %>% dplyr::select(ID, entrezgene, contains('MM.'))
    GSEA_enrichment = list()
    
    for(module in selected_modules){
      
      cat(paste0('\nModule: ', which(selected_modules == module), '/', length(selected_modules)))
      
      geneList = GSEA_dataset %>% pull(paste0('MM.',substring(module,2)))
      names(geneList) = GSEA_dataset %>% pull(entrezgene) %>% as.character
      geneList = sort(geneList, decreasing = TRUE)
      
      GSEA_GO = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, 
                      nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_DO = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, 
                      nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_DGN = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, 
                        nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_KEGG = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, 
                          nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_Reactome = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, 
                                 nPerm = nPerm, verbose = FALSE, seed = TRUE)
      
      GSEA_enrichment[[module]] = list('GO' = GSEA_GO, 'DO' = GSEA_DO, 'DGN' = GSEA_DGN, 'KEGG' = GSEA_KEGG, 
                                       'Reactome' = GSEA_Reactome)
      
      # Save after each iteration (in case it breaks)
      save(GSEA_enrichment, file = file_name)
    }
    
    rm(GSEA_dataset, nPerm, geneList, GSEA_GO, GSEA_DO, GSEA_DGN, GSEA_KEGG, GSEA_Reactome)
    
  }
  
  ################################################################################################################
  # ORA enrichment
  
  file_name = './../Data/preprocessedData/ORA_results_relaxed.RData'
  if(file.exists(file_name)){
    load(file_name)
  } else {
    cat('\n\nPerforming ORA\n')
    
    # Prepare input
    universe = EA_dataset$entrezgene %>% as.character
    
    # Perform Enrichment
    ORA_enrichment = list()
    
    for(module in selected_modules){
      
      cat(paste0('\nModule: ', which(selected_modules == module), '/', length(selected_modules)))
      
      genes_in_module = EA_dataset %>% filter(Module == module) %>% pull(entrezgene)
      
      ORA_GO = enrichGO(gene = genes_in_module, universe = universe, OrgDb = org.Hs.eg.db, ont = 'All', 
                        pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff, qvalueCutoff = 1)
      
      ORA_DO = enrichDO(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                        pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
      
      ORA_DGN = enrichDGN(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                          pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
      
      ORA_KEGG = enrichKEGG(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                            pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff) 
      
      ORA_Reactome = enrichPathway(gene = genes_in_module, universe = universe, qvalueCutoff = 1,
                                   pAdjustMethod = 'bonferroni', pvalueCutoff = pvalueCutoff)
      
      ORA_enrichment[[module]] = list('GO' = ORA_GO, 'DO' = ORA_DO, 'DGN' = ORA_DGN, 'KEGG' = ORA_KEGG, 
                                      'Reactome' = ORA_Reactome)
      
      # Save after each iteration (in case it breaks)
      save(ORA_enrichment, file = file_name)
    }
    
    rm(universe, genes_in_module, module, ORA_GO, ORA_DGN, ORA_DO, ORA_KEGG, ORA_Reactome)
  
  }
  
  ################################################################################################################
  # Get shared enrichment for each module
  
  selected_modules_enrichment = list()
  
  for(module in selected_modules){
    
    module_enrichment = list()
    GSEA_enrichment_for_module = GSEA_enrichment[[module]]
    ORA_enrichment_for_module = ORA_enrichment[[module]]
    
    for(dataset in c('KEGG', 'Reactome', 'GO', 'DO', 'DGN')){
      
      GSEA_enrichment_dataset = GSEA_enrichment_for_module[[dataset]] %>% data.frame %>%
        dplyr::rename('pvalue_GSEA' = pvalue, 'p.adjust_GSEA' = p.adjust, 'qvalues_GSEA' = qvalues)
      
      ORA_enrichment_dataset = ORA_enrichment_for_module[[dataset]] %>% data.frame %>%
        dplyr::rename('pvalue_ORA' = pvalue, 'p.adjust_ORA' = p.adjust, 'qvalue_ORA' = qvalue)
      
      # Get shared enrichments (if any)
      shared_enrichment_dataset = GSEA_enrichment_dataset %>% inner_join(ORA_enrichment_dataset, by = 'ID')
      
      module_enrichment[[dataset]] = shared_enrichment_dataset
    }
    
    selected_modules_enrichment[[module]] = module_enrichment
  }
  
  save(selected_modules_enrichment, file = './../Data/preprocessedData/top_modules_enrichment_relaxed.RData')
  
  rm(module, module_enrichment, GSEA_enrichment_for_module, ORA_enrichment_for_module, dataset, 
     GSEA_enrichment_dataset, ORA_enrichment_dataset, shared_enrichment_dataset)
}

Relaxing the p-value only worked for one of the modules, the other still has zero elements in common between the GSEA and ORA results


Relaxed enrichment for cluster 38


KEGG

compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[1], 'KEGG')

Enrichment results for cluster 38:
- GSEA has 7 enriched term(s)
- ORA has 155 enriched term(s)
- 1 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
hsa05168 Herpes simplex virus 1 infection 0.49945 0.0037757 0.49945 10/72

Genes involved in all enriched terms: JAK1, TP53, ZNF12, ZNF222, ZNF275, ZNF430, ZNF432, ZNF529, ZNF549, ZNF563


Relaxed enrichment for cluster 63


Disease Ontology

compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[2], 'DO')

Enrichment results for cluster 63:
- GSEA has 141 enriched term(s)
- ORA has 436 enriched term(s)
- 2 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
DOID:0050615 respiratory system cancer 0.1860411 0.0193661 0.0592858 17/88
DOID:4074 pancreas adenocarcinoma 0.2226042 0.3080494 0.0592858 8/88

Genes involved in all enriched terms: BCL2, CD59, CD63, CEBPG, EML4, ENO1, ERBB2, F3, FGF2, INHBA, ITGAV, LGALS3, LIMS1, MDM2, MGST1, MTHFR, NQO1, SH3GLB1, TUBB2B

Disease Gene Network

compare_methods(GSEA_enrichment, ORA_enrichment, selected_modules_enrichment, selected_modules[2], 'DGN')

Enrichment results for cluster 63:
- GSEA has 137 enriched term(s)
- ORA has 1610 enriched term(s)
- 1 terms are enriched in both methods

ID Description p.adjust_ORA p.adjust_GSEA qvalue_ORA GeneRatio
umls:C0008925 Cleft Palate 0.2953576 0.0798431 0.1200704 11/174

Genes involved in all enriched terms: ALDH1L1, BRWD3, CBFB, CHUK, ERBB2, EVC, FGF2, MID1, MTHFR, SDC2, TGFB2

Relaxed enrichment for cluster 46


——


GSEA and ORA top enrichment results for cluster 63


Note: I am using a corrected p-value threhsold of 0.05, since the relaxation was only because we were combining these results with the ones from the ORA

load('./../Data/preprocessedData/GSEA_results.RData')
load('./../Data/preprocessedData/ORA_results.RData')

print_GSEA_top_results = function(module, n){
  
  for(database in c('GO','DO','DGN','KEGG','Reactome')){
      res = GSEA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05 & NES>0) %>%
            dplyr::select(ID, Description, NES, p.adjust, qvalues) %>% arrange(desc(NES)) %>% top_n(n, wt=NES)  
      
      cat(paste0('\n',database,':\n'))
      
      if(nrow(res)>0){
        print(res %>% kable %>% kable_styling(full_width = F))  
        #print(xtable(res, display =c('s','s','s','f','e','e')), include.rownames=FALSE) # thesis
      } else { 
        cat('\nNo enriched terms found\n\n\n')
      }
    
  }
}

plot_shared_genes_GSEA = function(module, n){
    for(database in c('GO','DO','DGN','KEGG','Reactome')){
      plot_data = GSEA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05 & NES>0) %>%
                   arrange(desc(NES)) %>% dplyr::select(ID, core_enrichment) %>% slice_head(n=n) 
      
      if(nrow(plot_data)>1){
    
      shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
      for(i in 1:(nrow(plot_data)-1)){
        for(j in (i+1):nrow(plot_data)){
          gene_set_1 = strsplit(plot_data$core_enrichment[i], '/') %>% unlist %>% unique
          gene_set_2 = strsplit(plot_data$core_enrichment[j], '/') %>% unlist %>% unique
          shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
          shared_genes[j,i] = shared_genes[i,j]
        }
      }
      rownames(shared_genes) = plot_data$ID
      colnames(shared_genes) = plot_data$ID
  
      corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n', 
               tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
               mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
               title = paste0('Genes in common in the ',database, ' database for cluster ',
                              genes_info$module_number[genes_info$Module==module][1]))

      }
  }
}

print_ORA_top_results = function(module, n){
  
  for(database in c('GO','DO','DGN','KEGG','Reactome')){
      res = ORA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05) %>%
            dplyr::select(ID, Description, p.adjust, qvalue, GeneRatio) %>% arrange(p.adjust) %>% 
            top_n(n, wt=p.adjust)
      
      cat(paste0('\n',database,':\n'))
      
      if(nrow(res)>0){
        print(res %>% kable %>% kable_styling(full_width = F))  
        #print(xtable(res, display =c('s','s','s','e','e','s')), include.rownames=FALSE) # thesis
      } else { 
        cat('\nNo enriched terms found\n\n\n')
      }
    
  }
}

plot_shared_genes_ORA = function(module, n){
    for(database in c('GO','DO','DGN','KEGG','Reactome')){
      plot_data = ORA_enrichment[[module]][[database]]@result %>% filter(p.adjust<0.05) %>%
                  arrange(desc(p.adjust)) %>% dplyr::select(ID, geneID) %>% slice_head(n=n)
      
      if(nrow(plot_data)>1){
    
      shared_genes = matrix(0, nrow(plot_data), nrow(plot_data))
      for(i in 1:(nrow(plot_data)-1)){
        for(j in (i+1):nrow(plot_data)){
          gene_set_1 = strsplit(plot_data$core_enrichment[i], '/') %>% unlist %>% unique
          gene_set_2 = strsplit(plot_data$core_enrichment[j], '/') %>% unlist %>% unique
          shared_genes[i,j] = sum(gene_set_1 %in% gene_set_2)/length(unique(c(gene_set_1, gene_set_2)))
          shared_genes[j,i] = shared_genes[i,j]
        }
      }
      rownames(shared_genes) = plot_data$ID
      colnames(shared_genes) = plot_data$ID
  
      corrplot(shared_genes, type = 'lower', method = 'square', diag = FALSE, number.digits = 2, cl.pos = 'n', 
               tl.pos = 'ld', tl.col = '#666666', order = 'hclust', col.lim = c(0,1), addCoef.col = 'black',
               mar = c(0,0,2,0), tl.cex = 0.8, number.cex= 0.8,
               title = paste0('Genes in common in the ',database, ' database for cluster ',
                              genes_info$module_number[genes_info$Module==module][1]))

      }
  }
}


GSEA enrichment results

print_GSEA_top_results(selected_modules[3], 5)
GO:
ID Description NES p.adjust qvalues
GO:0045892 GO:0045892 negative regulation of transcription, DNA-templated 1.655441 0.0490487 0.0026761
DO:
ID Description NES p.adjust qvalues
DOID:0060040 DOID:0060040 pervasive developmental disorder 2.064808 0.0160467 0.0040484
DOID:0060041 DOID:0060041 autism spectrum disorder 2.041408 0.0161937 0.0040484
DOID:12849 DOID:12849 autistic disorder 2.041408 0.0161937 0.0040484
DOID:0060037 DOID:0060037 developmental disorder of mental health 1.968701 0.0073923 0.0040484
DGN:
ID Description NES p.adjust qvalues
umls:C0008074 umls:C0008074 Child Development Disorders, Pervasive 2.208991 0.0360512 0.0050244
umls:C0036857 umls:C0036857 Severe mental retardation (I.Q. 20-34) 2.172130 0.0361686 0.0050244
umls:C0035372 umls:C0035372 Rett Syndrome 2.113475 0.0326233 0.0050244
umls:C0018817 umls:C0018817 Atrial Septal Defects 1.988084 0.0306196 0.0050244
umls:C0014544 umls:C0014544 Epilepsy 1.881564 0.0267545 0.0050244
KEGG:
ID Description NES p.adjust qvalues
hsa04921 hsa04921 Oxytocin signaling pathway 2.187167 0.0043745 0.0008781
hsa04713 hsa04713 Circadian entrainment 2.114976 0.0092065 0.0008781
hsa04970 hsa04970 Salivary secretion 2.031143 0.0334118 0.0016375
hsa04261 hsa04261 Adrenergic signaling in cardiomyocytes 2.015750 0.0177863 0.0010729
hsa04728 hsa04728 Dopaminergic synapse 1.973352 0.0177863 0.0010729
Reactome:
ID Description NES p.adjust qvalues
R-HSA-3214842 R-HSA-3214842 HDMs demethylate histones 2.297315 0.0164876 0.0012422
R-HSA-5578775 R-HSA-5578775 Ion homeostasis 2.268130 0.0159153 0.0012422
R-HSA-445095 R-HSA-445095 Interaction between L1 and Ankyrins 2.229128 0.0167107 0.0012422
R-HSA-73728 R-HSA-73728 RNA Polymerase I Promoter Opening 2.223382 0.0166736 0.0012422
R-HSA-2299718 R-HSA-2299718 Condensation of Prophase Chromosomes 2.191772 0.0490282 0.0014944


Plots of the results when there are more than 2 terms in common between methods:

plot_shared_genes_GSEA(selected_modules[3], 5)



ORA enrichment results

print_ORA_top_results(selected_modules[3], 5)

GO:

No enriched terms found

DO:

No enriched terms found

DGN:

No enriched terms found

KEGG:

No enriched terms found

Reactome:

No enriched terms found





Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] xtable_1.8-4           kableExtra_1.1.0       knitr_1.32            
##  [4] doParallel_1.0.15      iterators_1.0.13       foreach_1.5.1         
##  [7] org.Hs.eg.db_3.8.2     AnnotationDbi_1.46.1   IRanges_2.18.3        
## [10] S4Vectors_0.22.1       Biobase_2.44.0         BiocGenerics_0.30.0   
## [13] DOSE_3.10.2            ReactomePA_1.28.0      clusterProfiler_3.12.0
## [16] biomaRt_2.40.5         polycor_0.7-10         expss_0.10.7          
## [19] WGCNA_1.69             fastcluster_1.2.3      dynamicTreeCut_1.63-1 
## [22] ggExtra_0.9            ggpubr_0.2.5           magrittr_2.0.1        
## [25] GGally_1.5.0           corrplot_0.90          colorspace_2.0-2      
## [28] gridExtra_2.3          viridis_0.6.1          viridisLite_0.4.0     
## [31] RColorBrewer_1.1-2     dendextend_1.15.1      plotly_4.9.2          
## [34] glue_1.4.2             reshape2_1.4.4         forcats_0.5.0         
## [37] stringr_1.4.0          dplyr_1.0.1            purrr_0.3.4           
## [40] readr_1.3.1            tidyr_1.1.0            tibble_3.1.2          
## [43] ggplot2_3.3.5          tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                  tidyselect_1.1.1           
##   [3] RSQLite_2.2.0               htmlwidgets_1.5.3          
##   [5] grid_3.6.3                  BiocParallel_1.18.1        
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] preprocessCore_1.46.0       miniUI_0.1.1.1             
##  [11] withr_2.4.2                 GOSemSim_2.10.0            
##  [13] highr_0.9                   rstudioapi_0.13            
##  [15] ggsignif_0.6.2              labeling_0.4.2             
##  [17] urltools_1.7.3              GenomeInfoDbData_1.2.1     
##  [19] polyclip_1.10-0             bit64_4.0.5                
##  [21] farver_2.1.0                vctrs_0.3.8                
##  [23] generics_0.1.0              xfun_0.25                  
##  [25] GenomeInfoDb_1.20.0         R6_2.5.1                   
##  [27] graphlayouts_0.7.0          locfit_1.5-9.4             
##  [29] DelayedArray_0.10.0         bitops_1.0-7               
##  [31] cachem_1.0.6                reshape_0.8.8              
##  [33] fgsea_1.10.1                gridGraphics_0.5-1         
##  [35] assertthat_0.2.1            promises_1.2.0.1           
##  [37] scales_1.1.1                ggraph_2.0.3               
##  [39] nnet_7.3-14                 enrichplot_1.4.0           
##  [41] gtable_0.3.0                tidygraph_1.2.0            
##  [43] rlang_0.4.11                genefilter_1.66.0          
##  [45] splines_3.6.3               lazyeval_0.2.2             
##  [47] acepack_1.4.1               impute_1.58.0              
##  [49] broom_0.7.0                 europepmc_0.4              
##  [51] checkmate_2.0.0             BiocManager_1.30.16        
##  [53] yaml_2.2.1                  modelr_0.1.6               
##  [55] crosstalk_1.1.1             backports_1.2.1            
##  [57] httpuv_1.6.1                qvalue_2.16.0              
##  [59] Hmisc_4.4-0                 tools_3.6.3                
##  [61] ggplotify_0.1.0             ellipsis_0.3.2             
##  [63] jquerylib_0.1.4             ggridges_0.5.3             
##  [65] Rcpp_1.0.7                  plyr_1.8.6                 
##  [67] zlibbioc_1.30.0             base64enc_0.1-3            
##  [69] progress_1.2.2              RCurl_1.98-1.4             
##  [71] prettyunits_1.1.1           rpart_4.1-15               
##  [73] cowplot_1.1.1               SummarizedExperiment_1.14.1
##  [75] haven_2.2.0                 ggrepel_0.9.1              
##  [77] cluster_2.1.0               fs_1.5.0                   
##  [79] data.table_1.14.0           DO.db_2.9                  
##  [81] reactome.db_1.68.0          triebeard_0.3.0            
##  [83] reprex_0.3.0                matrixStats_0.60.1         
##  [85] hms_1.1.0                   mime_0.11                  
##  [87] evaluate_0.14               XML_3.99-0.3               
##  [89] jpeg_0.1-9                  readxl_1.3.1               
##  [91] compiler_3.6.3              crayon_1.4.1               
##  [93] htmltools_0.5.2             later_1.3.0                
##  [95] Formula_1.2-4               geneplotter_1.62.0         
##  [97] lubridate_1.7.10            DBI_1.1.1                  
##  [99] tweenr_1.0.2                dbplyr_1.4.2               
## [101] rappdirs_0.3.3              MASS_7.3-53                
## [103] Matrix_1.2-18               cli_3.0.1                  
## [105] igraph_1.2.6                GenomicRanges_1.36.1       
## [107] pkgconfig_2.0.3             rvcheck_0.1.8              
## [109] foreign_0.8-76              xml2_1.3.2                 
## [111] annotate_1.62.0             bslib_0.3.0                
## [113] XVector_0.24.0              webshot_0.5.2              
## [115] rvest_0.3.5                 yulab.utils_0.0.2          
## [117] digest_0.6.27               graph_1.62.0               
## [119] rmarkdown_2.7               cellranger_1.1.0           
## [121] fastmatch_1.1-3             htmlTable_1.13.3           
## [123] curl_4.3.2                  shiny_1.6.0                
## [125] graphite_1.30.0             lifecycle_1.0.0            
## [127] jsonlite_1.7.2              fansi_0.5.0                
## [129] pillar_1.6.2                lattice_0.20-41            
## [131] fastmap_1.1.0               httr_1.4.2                 
## [133] survival_3.2-7              GO.db_3.8.2                
## [135] UpSetR_1.4.0                png_0.1-7                  
## [137] bit_4.0.4                   ggforce_0.3.1              
## [139] stringi_1.7.4               sass_0.4.0                 
## [141] blob_1.2.2                  DESeq2_1.24.0              
## [143] latticeExtra_0.6-29         memoise_2.0.0